#converts your phylip file into a one hot encoded file for VAE
#modified from https://elferachid.medium.com/one-hot-encoding-dna-92a1c29ba15a
#assumes your phylip headers don't have any spaces and that each sample (header + sequence) is on a single line (sequential, not interleaved, phylip)
#usage: python one_hot_encode.py input.phylip output.txt

import numpy as np
import sys

# Encoding of each supported base as a comma-separated "A,C,G,T" weight string.
# The two-base IUPAC ambiguity codes put 0.5 on each of the two bases they cover.
_UPPERCASE_ENCODING = {
	"A": "1.0,0.0,0.0,0.0",
	"C": "0.0,1.0,0.0,0.0",
	"G": "0.0,0.0,1.0,0.0",
	"T": "0.0,0.0,0.0,1.0",
	"R": "0.5,0.0,0.5,0.0",
	"Y": "0.0,0.5,0.0,0.5",
	"S": "0.0,0.5,0.5,0.0",
	"W": "0.5,0.0,0.0,0.5",
	"K": "0.0,0.0,0.5,0.5",
	"M": "0.5,0.5,0.0,0.0",
}

# Accept lowercase bases too, so that e.g. "a" is not silently encoded as missing data.
_BASE_ENCODING = dict(_UPPERCASE_ENCODING)
_BASE_ENCODING.update({base.lower(): code for base, code in _UPPERCASE_ENCODING.items()})

# Any symbol that is not in the table above (N, -, ?, ...) is encoded as missing data.
_MISSING_ENCODING = "0.0,0.0,0.0,0.0"


def onehote(seq):
	"""One-hot encode a nucleotide sequence.

	Args:
		seq: iterable of single-character symbols, typically a DNA string.

	Returns:
		A numpy array with one element per symbol of ``seq``. Each element is
		the string "a,c,g,t" of weights for that symbol: 1.0 for an unambiguous
		base, 0.5/0.5 for a two-base ambiguity code, and all zeros for any
		symbol that is not in the lookup table. Upper- and lowercase symbols
		are encoded identically.
	"""
	return np.array([_BASE_ENCODING.get(symbol, _MISSING_ENCODING) for symbol in seq])

def main(argv=None):
	"""Convert a phylip alignment into a one-hot encoded text file.

	Each non-blank line after the first is split on whitespace: the first
	field is the sample header and the remaining fields are joined to form the
	sequence. One output line is written per sample: the header, a tab, and the
	space-separated per-site encodings returned by ``onehote``.

	The output file is opened in append mode, so running the script again with
	the same output path adds to the existing contents rather than replacing
	them.

	Args:
		argv: optional list ``[input_path, output_path]``; defaults to the
			command-line arguments.

	Raises:
		SystemExit: if fewer than two arguments are supplied.
		ValueError: if a non-blank line has a header but no sequence.
	"""
	args = sys.argv[1:] if argv is None else list(argv)
	if len(args) < 2:
		sys.exit("usage: python one_hot_encode.py input.phylip output.txt")
	infile = args[0] #argument 1 as input file
	outfile = args[1] #argument 2 as output file

	# Open the input first so a bad input path does not leave an empty output file behind.
	with open(infile, 'r') as dna_file, open(outfile, 'a') as out:
		next(dna_file, None) #skip the first line of phylip which is matrix dimensions
		# Stream the file line by line instead of loading the whole alignment into memory.
		for line_number, line in enumerate(dna_file, start=2):
			cols = line.split() #split the header and sequence in your phylip
			if not cols:
				continue #blank line (e.g. trailing newline at the end of the file)
			if len(cols) < 2:
				raise ValueError(
					"line %d of %s has a header but no sequence: %r"
					% (line_number, infile, line.rstrip("\n"))
				)
			name = cols[0] #store header
			# Headers are assumed to contain no spaces, so any extra fields are
			# chunks of a sequence that was written with spaces inside it.
			dna = "".join(cols[1:]) #store sequence
			encoded = onehote(dna) #one hot encode your sequence
			encoded_text = ' '.join(str(x) for x in encoded) #convert your one hot array to string
			print(name, encoded_text, sep='\t', file=out) #header, a tab, and then your sequence


if __name__ == "__main__":
	main()